# Data-analysis stack: pandas for dataframes, matplotlib/seaborn for plotting
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
import warnings
# Silence all library warnings for cleaner notebook output
# NOTE(review): this also hides pandas SettingWithCopyWarning, which can mask real bugs
warnings.filterwarnings('ignore')
# Render matplotlib figures inline (IPython magic)
%matplotlib inline
sns.set_style('whitegrid')
# add time converter for matplotlib visualisation
pd.plotting.register_matplotlib_converters()
For the data on wars, we are going to use data obtained from the Wikidata Query Service. As we selected only the data we are interested in, the dataset is quite small, as we can see below:
# Location of the Wikidata/Wikipedia extracts
wikipedia_path = 'data/wikipedia/'
# One JSON file per participant category
state_in_war_file = 'states.json'
political_in_war_file = 'political.json'
terroristic_in_war_file = 'terroristic.json'

def _read_wars_json(filename):
    """Load one records-oriented, UTF-8 encoded JSON extract as a dataframe."""
    return pd.read_json(wikipedia_path + filename, orient='records', encoding='utf-8')

# Load each category into its own dataframe
wars_states_df = _read_wars_json(state_in_war_file)
wars_political_df = _read_wars_json(political_in_war_file)
wars_terroristic_df = _read_wars_json(terroristic_in_war_file)
For now, we are not interested in the differences between these three categories, but only between state and non-state actors. We therefore only mark whether a participant is state or non-state. Next we stack the dataframes to create a unique one.
# Mark the elements of the DFs as either state or non-state actors
wars_states_df['is_state'] = True
wars_political_df['is_state'] = False
wars_terroristic_df['is_state'] = False
# Stack the dataframes into a single one.
# FIX: DataFrame.append is deprecated and removed in pandas 2.0 -- use pd.concat
# (sort=True sorts the union of the columns, as append(sort=True) did).
raw_wars_df = pd.concat(
    [wars_states_df, wars_political_df, wars_terroristic_df],
    sort=True
)
# Rename the columns to a more readable style
renamed_wars_df = raw_wars_df.rename(columns={
    'armed_conflictLabel': 'name',
    'participantLabel': 'participant',
    'countryWarLabel': 'country',
    'locationLabel': 'location'
})
# FIX: report the number of rows; DataFrame.size counts cells (rows * columns),
# which does not match the "Length" label
print(f"Length of wars dataframe: {len(renamed_wars_df)}")
The dataframe can be kept in memory.
For food-related data, we are using data from FAOSTAT website. Here we can query FAO database as well and can download only the datasets we need. Particularly, we are interested in the following datasets:
These datasets are quite small and can be kept in memory.
# Folder holding the FAOSTAT extracts
fao_path = 'data/fao/'
# Individual dataset filenames
crops_production_file = 'crops_production_aggregated.csv'
livestock_production_file = 'livestock_production.csv'
population_file = 'population.csv'
land_use_file = 'land_use.csv'

# FAOSTAT CSVs are latin-1 encoded
raw_crops_production_df = pd.read_csv(fao_path + crops_production_file, encoding='latin-1')
raw_livestock_production_df = pd.read_csv(fao_path + livestock_production_file, encoding='latin-1')
raw_population_df = pd.read_csv(fao_path + population_file, encoding='latin-1')
raw_land_use_df = pd.read_csv(fao_path + land_use_file, encoding='latin-1')

# Parallel lists of names and dataframes, so the two can be iterated together
agricultural_dfs_names = ['crops', 'livestock', 'population', 'land']
agricultural_dfs = [
    raw_crops_production_df,
    raw_livestock_production_df,
    raw_population_df,
    raw_land_use_df,
]

# Report the size of each dataset
for df_name, agri_df in zip(agricultural_dfs_names, agricultural_dfs):
    print(f"Length of {df_name} dataframe: {agri_df.size}")
All these datasets can be easily handled by a laptop.
Now, let us check the types of the columns of the dataframes:
# Inspect the column dtypes of the combined wars dataframe (notebook display)
renamed_wars_df.dtypes
The other two war dataframes have the same formats. We can see that the years have different types. It can be useful to convert them to datetime years. We are doing it after dealing with missing values.
# Show the column dtypes of every FAO dataframe
for df_name, agri_df in zip(agricultural_dfs_names, agricultural_dfs):
    print(f"Types in {df_name} dataframe:\n {agri_df.dtypes}\n")
Let us check whether there are missing values in the wars dataframe:
# Count distinct wars that have at least one null field
wars_with_missing = renamed_wars_df[renamed_wars_df.isnull().any(axis=1)]
print("Number of wars with missing data: " +
      str(wars_with_missing.groupby('name').count().shape[0]))
print("Columns with missing data:")
# List the columns containing at least one null (notebook display)
renamed_wars_df.columns[renamed_wars_df.isnull().any()].tolist()
Let us check which entries have missing end year:
# Wars without an end year: keep only the name and start year
missing_end_mask = renamed_wars_df['end_year'].isnull()
missing_end_year_wars = (
    renamed_wars_df
    .loc[missing_end_mask, ['name', 'start_year']]
    # A war appears once per participant; keep one row per war
    .drop_duplicates()
)
# Notebook display
missing_end_year_wars
Among these wars, only the Shelling of Yeonpyeong has come to an end (it was actually a one-day event). All the others are conflicts that are still ongoing (even the older ones). As a matter of fact, due to how the Wikidata query is made, the wars without an end date have not finished yet. Hence, we can fill the value with the last year in the FAO dataset, which is 2017:
from datetime import datetime  # NOTE(review): appears unused here -- pd.to_datetime handles the parsing
# The Shelling of Yeonpyeong ended in 2010 (it was a one-day event)
renamed_wars_df.loc[renamed_wars_df['name'] == 'Shelling of Yeonpyeong', 'end_year'] = 2010
# Close the remaining (ongoing) wars at 2017, the last year covered by the FAO data
renamed_wars_df.loc[renamed_wars_df['end_year'].isnull(), 'end_year'] = 2017
# Change start year and end year to datetime objects
# (end_year is cast to int first: the NaNs made it a float column)
renamed_wars_df['start_year'] = pd.to_datetime(renamed_wars_df['start_year'], format='%Y')
renamed_wars_df['end_year'] = pd.to_datetime(renamed_wars_df['end_year'].astype(int), format='%Y')
Now, let us check how many wars are missing the country where the war took place:
# Distinct war names whose country is missing
missing_country_wars = renamed_wars_df.loc[renamed_wars_df['country'].isnull(), 'name'].drop_duplicates()
print("Number of wars with missing country: " + str(missing_country_wars.shape[0]))
As there are few, we can fill the country manually, and since we are not interested in the precise location of the wars, we can drop the column location.
# Show the unique wars with missing country (notebook display)
renamed_wars_df.loc[renamed_wars_df['country'].isnull(), ['name', 'country', 'location']].drop_duplicates()
non_null_wars_df = renamed_wars_df.copy()
# Manual country fixes for wars whose country is missing in Wikidata.
# Multi-country wars use a comma-separated list that is exploded below.
# FIX: the original cell repeated three assignments (Tell Abyad offensive,
# 2006 Lebanon War, Gaza-Israel clashes); a mapping removes the duplication.
missing_country_fixes = {
    'Tet Offensive': 'Vietnam',
    'Kosovo War': 'Kosovo',
    'Soviet–Afghan War': 'Afghanistan',
    'Cambodian Civil War': 'Cambodia',
    'First Congo War': 'Democratic Republic of the Congo',
    'Djiboutian Civil War': 'Djibouti',
    'Libyan–Egyptian War': 'Libya,Egypt',
    'Paquisha War': 'Ecuador,Peru',
    'Sino-Vietnamese War': 'Vietnam',
    '1982 Ethiopian–Somali Border War': 'Somalia',
    # TODO(review): 'Ceneps Guar' looks like a garbled 'Cenepa War' -- confirm against the data
    'Ceneps Guar': 'Ecuador,Peru',
    'Second Kurdish–Iraqi War': 'Iraq',
    'Vietnam War': 'Vietnam',
    'Six-Day War': 'Israel,Egypt,Palestine,Syria',
    'Tell Abyad offensive': 'Syria',
    '2006 Lebanon War': 'Lebanon,Israel',
    'Gaza–Israel clashes (May 2019)': 'Israel,Palestine',
    '2003 invasion of Iraq': 'Iraq',
    '2014 Israel–Gaza conflict': 'Palestine,Israel',
    '2018–19 Gaza border protests': 'Palestine,Israel',
    'Croatian War of Independence': 'Croatia',
    'Gaza–Israel conflict': 'Palestine,Israel',
    'Insurgency in the Maghreb (2002–)': 'Algeria,Libya,Mauritania,Morocco,Tunisia,Western Sahara',
    "Lord's Resistance Army insurgency": 'Uganda,South Sudan,Democratic Republic of the Congo,Central African Republic',
}
for war_name, war_country in missing_country_fixes.items():
    non_null_wars_df.loc[non_null_wars_df['name'] == war_name, 'country'] = war_country
# Explode entries with multiple locations
non_null_wars_df['country'] = non_null_wars_df['country'].str.split(',')
exploded_wars_df = non_null_wars_df.reset_index().explode('country').drop('index', axis=1)
# Remove dangling spaces which are left
stripped_wars_df = exploded_wars_df.copy()
stripped_wars_df['country'] = exploded_wars_df['country'].str.strip()
Let us see what is left:
# Rows whose country is still null after the manual fixes (notebook display)
stripped_wars_df[stripped_wars_df['country'].isnull()]
Since the War on Terror and the Iran–Israel proxy conflict can each be considered a series of wars, and have no defined location, we drop them.
# Drop the two umbrella conflicts that have no single location
umbrella_conflicts = ['War on Terror', 'Iran–Israel proxy conflict']
final_wars_df = stripped_wars_df[~stripped_wars_df['name'].isin(umbrella_conflicts)]
# Sanity check: no wars with a missing country should remain
print("Number of wars with missing country: " +
      str(final_wars_df.loc[final_wars_df['country'].isnull(), 'name'].drop_duplicates().shape[0]))
Let us first see the different unique in the features, in order to decide if we can drop them:
# Count rows per value of each candidate column -- a single group means the
# column carries no information and can be dropped (separate display cells)
raw_crops_production_df.groupby('Flag').count()
raw_crops_production_df.groupby('Unit').count()
raw_crops_production_df.groupby('Element').count()
raw_crops_production_df.groupby('Domain').count()
Since we have only one value for those columns, we can drop them. We just need to keep in mind that the unit used is tonnes. Let us drop useless columns, and rename the remaing ones to be more consistent with the wars dataframe naming.
def drop_useless_columns(fao_df):
    """Keep only the informative FAO columns and normalise their names.

    Parameters
    ----------
    fao_df : pd.DataFrame
        Raw FAOSTAT dataframe with at least 'Area', 'Item', 'Year', 'Value'.

    Returns
    -------
    pd.DataFrame
        New dataframe with columns 'country', 'item', 'year', 'value',
        where 'year' is converted to a datetime.
    """
    # FIX: .copy() so that the 'year' assignment below writes to an owned
    # frame instead of a selection of the caller's frame
    # (SettingWithCopyWarning, hidden by the global warnings filter)
    fao_df = fao_df[['Area', 'Item', 'Year', 'Value']].copy()
    fao_df = fao_df.rename(columns={
        'Area': 'country',
        'Item': 'item',
        'Year': 'year',
        'Value': 'value'
    })
    # Convert the integer year to a datetime object for plotting
    fao_df['year'] = pd.to_datetime(fao_df['year'], format='%Y')
    return fao_df
# Clean every FAO dataframe in one pass
dropped_agricultural_dfs = [drop_useless_columns(fao_df) for fao_df in agricultural_dfs]
Now, let us see if there are missing values:
# Report, per dataset, how many cells belong to rows with a missing value
for df_name, agri_df in zip(agricultural_dfs_names, dropped_agricultural_dfs):
    rows_with_nulls = agri_df[agri_df.isnull().any(axis=1)]
    print("In the {name} dataset there are {missing} missing values.".format(
        name=df_name,
        missing=rows_with_nulls.size
    ))
# Saving the clean versions of the agricultural dfs
final_crop_df, final_livestock_df, final_population_df, final_land_df = dropped_agricultural_dfs
We can see that many values are missing. However, since we are going to aggregate the data, we assume that the impact of missing data will be low. Moreover, we will need to know in which years we are missing data, in order to weight better the averages we are going to compute. When we will need more specific data about some particular countries, we will decide how to handle the problem in a more elaborate way.
As the dataframes are from different sources, the names of the countries might be different. Let us check which countries are in the wars dataframe, and are not in the FAO's one. We can do this by taking the set difference $wars\_countries \setminus fao\_countries$.
# Countries present in the wars data but absent from the FAO data
not_common_countries = set(final_wars_df['country']) - set(final_crop_df['country'])
# Notebook displays
not_common_countries
final_crop_df['country'].unique()
# Create a mapping to match names
# Mapping from FAO country names to the Wikidata ones used in the wars data.
# FIX: the original also contained "Côte d'Ivoire" and "Réunion" mapped to
# themselves -- identity entries are no-ops and have been removed.
countries_mapping = {
    'Iran (Islamic Republic of)' : 'Iran',
    'Republic of Moldova' : 'Moldova',
    'Republic of Korea' : 'South Korea',
    'Bolivia (Plurinational State of)' : 'Bolivia',
    'Ethiopia PDR' : 'Ethiopia',
    'Timor-Leste' : 'East Timor',
    'Viet Nam': 'Vietnam',
    'Syrian Arab Republic': 'Syria',
    'USSR': 'Soviet Union',
    'Russian Federation': 'Russia',
    "Lao People's Democratic Republic": 'Laos',
    'Sudan (former)': 'Sudan',
    'China' : "People's Republic of China",
    'Congo': 'Republic of the Congo',
}
def replace_country_names(fao_df):
    """Return a copy of `fao_df` with FAO country names mapped to Wikidata ones.

    NOTE(review): DataFrame.replace scans every column, not only 'country';
    assumes the mapped names never occur in other columns -- confirm if the
    FAO 'item' values ever contain country names.
    """
    return fao_df.replace(countries_mapping)
# Re-map the country names in every agricultural dataframe
replaced_agricultural_dfs = [replace_country_names(fao_df) for fao_df in dropped_agricultural_dfs]
# Unpack the list into individual dataframes
final_crop_df, final_livestock_df, final_population_df, final_land_df = replaced_agricultural_dfs
# Palestine appears under two names in the wars DF; make it uniform
final_wars_df.replace({'State of Palestine': 'Palestine'}, inplace=True)
Myanmar and Angola are in war for the entire duration of our crop data, so it is not possible to see what influence war had on their agriculture. We have no FAO data for: Artsakh, South Ossetia, Rhodesia, Abkhazia, Kosovo. Finally, there is a mismatch in the years in which Slovenia has been in war: the Wikipedia dataframe lists a war in 1991, but in 1991 Slovenia was still part of Yugoslavia. Therefore we remove all these countries from the wars dataframe.
# Remove countries that are at war for the whole observation window
# (Myanmar, Angola), have no FAO data, or have inconsistent years (Kosovo)
excluded_countries = [
    'Myanmar',
    'Angola',
    'Artsakh',
    'South Ossetia',
    'Rhodesia',
    'Republic of Abkhazia',
    'Kosovo',
]
final_wars_df = final_wars_df[~final_wars_df['country'].isin(excluded_countries)]
Let us plot the following distributions to further explore our datasets:
Let us start with the number of years in which a country had a war. In order to do this, we first have to avoid that wars overlap in years. For instance, if a country had two wars, one from 1955 to 1970, and another one from 1965 to 1971, we do not want the years from 1965 to 1970 to be counted twice.
We accomplish this by first creating a new column containing all the years of each war, next we explode the column such that we have one row per each year a country had in a specific war. We finally take into consideration only the country and the years and remove the (overlapping) duplicates.
# Wars with their time range and location, one row per (country, war)
country_wars_df = final_wars_df[['country', 'name', 'start_year', 'end_year']].drop_duplicates()
# For every war, list all the calendar years it spans (last year included)
country_wars_df['years'] = [
    list(range(start.year, end.year + 1))
    for start, end in zip(country_wars_df['start_year'], country_wars_df['end_year'])
]
unique_years_df = (
    country_wars_df
    # One row per (country, year); overlapping wars produce duplicate couples
    .explode('years')[['country', 'years']]
    # Count each (country, year) couple only once
    .drop_duplicates()
    # Re-index after the explode and discard the old index
    .reset_index()
    .drop('index', axis=1)
)
# Number of distinct war years per country, sorted for the plot
years_per_country = unique_years_df.groupby('country').count().sort_values(by='years').reset_index()
# Horizontal bar chart of war years per country
plt.figure(figsize=(15,15))
plt.grid(True)
plt.barh(years_per_country['country'], years_per_country['years'])
plt.title('Number of years in war for each country')
_ = plt.xlabel('Total number of years the country is in the war')
We can easily see that luckily many countries did not have a war (e.g. European countries); some of them, on the other hand, have been in war for all the 57 years of observations.
But are long wars more or less common than short ones? Let us plot a histogram to make it clearer.
# Histogram of how many countries fall in each war-years bucket
f, ax = plt.subplots(figsize=(15, 10))
n_bins = years_per_country.shape[0] // 2
ax.hist(years_per_country['years'], bins=n_bins)
ax.set_title('The histogram plot for the number of countries corresponding to the number of years in war')
ax.set_xlabel('Total number of years in war')
_ = ax.set_ylabel('Number of countries')
Luckily enough, we can consider long wars quite exceptional.
Next, we want to plot the total production of the crops per year and per country, as an overall, general trend. In addition, we plot similar figures for the total amount of livestock for all countries, total agricultural area and total population, each per year and per country.
Before doing that let us check if the total periods of observation for each country are the same in order to exclude wrong interpretation of the data.
def sum_total_yearly_quantity(df):
    """Total the 'value' column per (country, year).

    FIX: aggregate only the numeric 'value' column. Summing the whole frame
    concatenates the 'item' strings on pandas >= 2.0 (older pandas silently
    dropped non-numeric columns, which is the behaviour intended here).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'country', 'year' and 'value' columns.

    Returns
    -------
    pd.DataFrame
        One row per (country, year) with the summed 'value'.
    """
    return (
        df.groupby(['country', 'year'], as_index=False)['value']
        .sum()
    )
# Aggregate every dataset per country and year
total_productions_per_year_per_country = [
    sum_total_yearly_quantity(agri_df) for agri_df in replaced_agricultural_dfs
]
# Unpack the list to individual dataframes
(crop_per_year_per_country,
 livestock_per_year_per_country,
 population_per_year_per_country,
 land_per_year_per_country) = total_productions_per_year_per_country
for df_name, agri_df in zip(agricultural_dfs_names, total_productions_per_year_per_country):
    # Distinct per-country observation counts; a single distinct count means
    # every country shares the same observation period
    years_of_observations = agri_df.groupby('country').count()['year'].unique()
    print('Years of observations for {} are the same for all countries: {}'.format(df_name, len(years_of_observations) == 1))
As we can see, different countries have different years of collecting the data, let us show an example for the crop production.
# Distinct numbers of observed years across the countries
crop_years_observation = (
    crop_per_year_per_country
    .groupby('country')['year']
    .count()
    .unique()
)
# Notebook display
crop_years_observation
The crop_years_observation array represents the different periods of time over which the countries collected crop production data. We see that some countries made observations for 57 years, but some countries observed for 26, 18, etc. years. This should affect the interpretation of the total crop production of all countries per year.
Let us have a quick glance at which countries collected data for a smaller period of time.
# Per-country observation counts (the 'value' count duplicates 'year', drop it)
counted_data = crop_per_year_per_country.groupby('country').count().drop('value', axis=1)
# List the countries behind every period except the first one
for observation in crop_years_observation[1:]:
    matching_countries = counted_data[counted_data['year'] == observation].index.tolist()
    print('{} years of observation for these countries: {}'.format(observation, matching_countries))
We can see that different countries have different periods of observations.
It makes sense that the USSR, Czechoslovakia, Yugoslavia SFR, Former Sudan and South Sudan, Pacific Islands Trust Territory and Ethiopia PDR have smaller periods of data collection as they no longer exist, having been replaced by other countries. The same can be said about CIS member states, Czechia, Serbia and other countries that have smaller periods because they became independent recently. We can also suppose that data about Belgium and Luxembourg were gathered together in a unique datapoint each year, until 1999. Hence we can conclude that the discrepancy between data time periods caused by these events does not influence our data.
For other countries (United States Virgin Islands, Marshall Islands, Cayman Islands, etc.) we can say that their contributions to the total production of crops, total amount of livestock, population and agricultural land area are insignificant, as they are really small and not global production leaders.
57 years is a period long enough to observe important changes at the national level. However, the visualisation of the total value of crop production/livestock/land area should not be affected by the changes previously discussed.
# 2x2 grid: one panel per aggregated quantity
f, ax = plt.subplots(2, 2, figsize=(15, 7))
y_labels = [
    'crops production (tonnes)',
    'amount of livestock for all countries per year',
    'agricultural land area (1000 ha)',
    'population (1000 persons)'
]
for i, (df, name) in enumerate(zip(total_productions_per_year_per_country, agricultural_dfs_names)):
    # Total over all countries, per year
    prod_per_year = df.groupby('year').sum().reset_index()
    panel = ax[i // 2, i % 2]
    # FIX: plot_date is deprecated (matplotlib >= 3.5); plot() handles
    # datetime x-values natively; marker='o' keeps the plot_date look
    panel.plot(prod_per_year['year'], prod_per_year['value'], linestyle='-', marker='o')
    panel.set_title(f'Total {name} for all countries per year')
    panel.set_xlabel('Year')
    # FIX: the original concatenated 'Total' + label without a space
    panel.set_ylabel('Total ' + y_labels[i])
f.tight_layout()
As expected, the total amount and production of goods have increased as well as the total population. Also we observe that the discrepancy of the time points of observation, indeed, does not influence the interpretation of the data. Interestingly, in the year 2000 the amount of agricultural land area reached its peak and then decreased.
Our final aim is to compare the livestocks and agricultural production of different countries. Would it be fair though to compare big countries with tiny ones? Densely populated and almost uninhabited ones? We started looking for normalisation criteria. Of course, the bare size of a country is not a good normalization factor. What about Russia and its permafrost areas, or Egypt and its deserts?
We therefore came up with two factors that we believe to be more representative:
The following analysis has been done to understand if these criteria are valid and if we can really obtain more meaningful data by comparing normalised productions (both for area and population ) instead of the absolute one. In order to do so, we examined possible correlations between different observations in our datasets.
First, let us calculate the correlation between total production of crops, the amount of livestock and the agricultural area used over the observed time period. If some time points are present in one dataset and missing in another, then we drop the row that corresponds to this time point. Also, now that we are digging deeper into the data, missing data could affect our results in a non-negligible way. Let us check this.
# Set of countries appearing in each aggregated dataset
crop_unique_countries = set(crop_per_year_per_country['country'].unique())
livestock_unique_countries = set(livestock_per_year_per_country['country'].unique())
population_unique_countries = set(population_per_year_per_country['country'].unique())
land_unique_countries = set(land_per_year_per_country['country'].unique())
country_sets = [crop_unique_countries, livestock_unique_countries,
                population_unique_countries, land_unique_countries]
# Countries missing from at least one dataset: union minus intersection
all_countries = set().union(*country_sets)
intersected_set_of_countries = crop_unique_countries.intersection(*country_sets)
non_common_countries = all_countries.symmetric_difference(intersected_set_of_countries)
# Notebook display
non_common_countries
The countries listed above are the non-overlapping countries, meaning that some data are missing. Most of them are not relevant for our project, hence we can exclude them for this initial analysis.
Now we can merge 4 agricultural datasets into one and calculate the Pearson correlations between 4 different features for each country:
# Merge the four aggregated datasets on (country, year); the inner joins keep
# only the time points present in every dataset
merged_data = (
    crop_per_year_per_country
    .merge(livestock_per_year_per_country, how='inner',
           on=['country', 'year'],
           suffixes=('_crop', '_livestock'))
    .merge(population_per_year_per_country, how='inner',
           on=['country', 'year'])
    .rename(columns={'value': 'value_population'})
    .merge(land_per_year_per_country, how='inner',
           on=['country', 'year'])
    .rename(columns={'value': 'value_land_area'})
)
# Notebook display
merged_data
# Calculate the correlation matrix for each country with 4 features
# (fillna(0) replaces undefined correlations, e.g. from constant series)
corr_data = merged_data.groupby('country').corr().fillna(0)
# Getting the paired correlations: xs(..., level=1) selects, for every
# country, the correlation-matrix row of one feature; indexing by column
# then yields the pairwise correlation per country
crop_land_area_corr = corr_data.xs('value_crop', level=1)['value_land_area']
crop_population_corr = corr_data.xs('value_crop', level=1)['value_population']
livestock_population_corr = corr_data.xs('value_livestock', level=1)['value_population']
corr_data
A quick glance at the correlation matrix shows that some countries seem to rely more on livestock than on crop production to maintain their population growth. For example, in Afghanistan, we see that the population size has a very strong correlation with crop production and a nearly zero correlation with the total livestock. Looking at Zimbabwe, we see the exact opposite situation. We will plot these correlations in order to visualize the entire dataset.
# Horizontal bar charts of the per-country correlations
f, ax = plt.subplots(figsize=(15, 40))
crop_land_area_corr.sort_values().plot.barh(ax=ax)
ax.set_title('Correlation between crop production and agricultural land area')
ax.set_xlabel('Pearson correlation');
f, ax = plt.subplots(figsize=(15, 40))
crop_population_corr.sort_values().plot.barh(ax=ax)
ax.set_title('Correlation between crop production and population')
ax.set_xlabel('Pearson correlation');
f, ax = plt.subplots(figsize=(15, 40))
livestock_population_corr.sort_values().plot.barh(ax=ax)
# FIX: the original title said 'agricultural land area', but the data plotted
# is the livestock/population correlation
ax.set_title('Correlation between livestock and population')
ax.set_xlabel('Pearson correlation');
Surprisingly, the correlation between our features is very variable with time, and this is true for each country.
We can conclude that both the population and the agricultural land area are not good normalising factors. Therefore, we should treat every feature as an independent one for future analysis.
# Export dataframes to CSV for use in other notebooks
# export_csv = merged_data.to_csv(r'pandas_dfs/merged_data.csv', index = None, header=True)
# export_csv = years_per_country.to_csv(r'pandas_dfs/years_per_country.csv', index = None, header=True)
# export_csv = country_wars_df.to_csv(r'pandas_dfs/country_wars_df.csv', index = None, header=True)
# export_csv = final_wars_df.to_csv(r'pandas_dfs/final_wars_df.csv', index = None, header=True)
At this point of our analysis, it is clear that the data we are dealing with is complex and that we need to narrow down our project to a smaller more manageable scope if we are to obtain meaningful results. Therefore, our goal is not to describe the general production trends for all the countries as there are simply too many factors that make each country and each war unique. Because of this, we doubt that we would be able to come to any insightful conclusions with a more general approach.
This is why we have decided to only focus on few cases that have grasped our attention, in order to better estimate the impact of war on their unique situations.
To select countries of interest, we created a plotting function that displays the evolution of crop production during war and non-war years. After visualizing many different countries with our function, we have decided to dig deeper into the Lebanese Civil war. This war falls nicely in the middle of our data time frame, is a civil war and has lasted 15 years.
We now create a plot in which we can select a country for which we want to visualize the production of crops that it had each year. In the plot, war years are marked in red, and non-war ones in grey, in order to highlight the trend of a country's production and the potential differences that could stand out due to wars.
# Import plotly
import plotly.graph_objects as go
import plotly
# Enable offline rendering of plotly figures inside the notebook
plotly.offline.init_notebook_mode(connected=True)
def year_in_war(country_wars, year_row):
    """
    Return True when `year_row['year']` falls inside any war listed in
    `country_wars` (start and end year are both inclusive).
    """
    year = year_row['year']
    # Vectorised inclusive interval test over all wars at once
    in_any_war = (country_wars['start_year'] <= year) & (year <= country_wars['end_year'])
    # bool() keeps the plain-Python return type of the original
    return bool(in_any_war.any())
def country_in_war(wars_df, prod_per_year_per_country, country):
    """
    Build the per-year production dataframe of `country` with an extra
    boolean column `in_war` flagging the years the country was at war.

    Parameters
    ----------
    wars_df : pd.DataFrame
        Wars data with 'country', 'name', 'start_year', 'end_year', 'is_state'.
    prod_per_year_per_country : pd.DataFrame
        Production data with a 'country' column and one row per year.
    country : str
        Exact country name to select.
    """
    # FIX: the original used str.contains(country), i.e. regex substring
    # matching -- e.g. 'Republic of the Congo' also matched wars located in
    # the 'Democratic Republic of the Congo'. After the explode step the
    # country column holds single names, so exact equality is the right test.
    country_wars = (
        wars_df[wars_df['country'] == country]
        .groupby(['name', 'start_year', 'end_year'])
        .sum()
        .reset_index()
        .drop('is_state', axis=1)
    )
    country_crops = (
        prod_per_year_per_country[prod_per_year_per_country['country'] == country]
        .drop('country', axis=1)
    )
    # Flag every production year that falls inside one of the country's wars
    country_crops['in_war'] = country_crops.apply(lambda row: year_in_war(country_wars, row), axis=1)
    return country_crops
# Create a list of all unique country names in FAO dataset
list_countries = merged_data.country.unique()
# Build the Plotly figure. Each country gets two traces -- one for peace-time
# years and one for war years -- so the two states can be coloured differently.
fig = go.Figure()
for country in list_countries:
    country_prod_war = country_in_war(final_wars_df, merged_data, country)
    prod_in_war = country_prod_war.copy()
    prod_no_war = country_prod_war.copy()
    # FIX: use .loc instead of chained indexing (df['col'][mask] = ...):
    # chained assignment does not reliably write back and is an error under
    # pandas copy-on-write
    prod_in_war.loc[~prod_in_war['in_war'], 'value_crop'] = None
    prod_no_war.loc[prod_no_war['in_war'], 'value_crop'] = None
    # Add data for years not in war
    fig.add_trace(go.Scatter(x=prod_no_war["year"], y=prod_no_war["value_crop"],
                             mode='lines+markers',
                             name='No conflict',
                             hoverinfo='skip',
                             connectgaps=False,
                             visible=False,
                             line=dict(color='RGB(142, 141, 138)', width=2)))
    # Add data for in war years
    fig.add_trace(go.Scatter(x=prod_in_war["year"], y=prod_in_war["value_crop"],
                             mode='lines+markers',
                             name='In war',
                             hoverinfo='skip',
                             connectgaps=False,
                             visible=False,
                             line=dict(color='RGB(232, 90, 79)', width=2)))
# Add titles to plot
fig.update_layout(title='Crop Production',
                  xaxis_title='Year',
                  yaxis_title='Total Production (Tonnes)',
                  plot_bgcolor="White")
# Format axes
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='RGB(234, 231, 220)')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='RGB(234, 231, 220)')
# Drop-down menu: one button per country, toggling the visibility of that
# country's pair of traces
buttons = []
# Initial button that shows an empty plot
buttons.append(dict(method='update',
                    args=[{'visible': [False] * len(list_countries) * 2}],
                    label='Select Country'))
# Country i owns traces 2*i (no conflict) and 2*i + 1 (in war)
for i, country_label in enumerate(list_countries):
    show = [False] * len(list_countries) * 2
    show[2 * i] = True
    show[2 * i + 1] = True
    buttons.append(dict(method='update',
                        args=[{'visible': show}],
                        label=country_label))
# Attach the menu and position the legend
menus = [dict(x=0.2, y=1.15, yanchor='top', xanchor='left', buttons=buttons)]
fig.update_layout(legend=dict(x=0.8, y=1.15), updatemenus=menus)
# Run this cell to create an HTML object of the above figure
#plotly.offline.plot(fig, filename='crops-vs-year4.html')
It is clear that each country has its own trend. For some of them (e.g. Kuwait) there has been a drop in production during wars; for others (e.g. Lebanon) there has been a clear rise. There are also countries that had more than one war, with different trends between the wars. For instance, Afghanistan sees a decrease in production in the war against the Soviet Union, and an oscillating rise during the war between the Taliban and the United States.
We are now going to plot the average yearly rate of change in crops production. On the Y-axis we plot the average of war years, and on the X-axis we plot the average of non-war years. In this way, countries which were negatively impacted by war should be on the left, and those which had no impact should be on the right.
We also plot as a third dimension the number of years in war using a color scale, to see whether this factor influences the production as well.
def compute_rate(final_wars_df, country, agricultural_data):
    """
    Computes the average yearly rate of change (%) of crop production for a
    country, separately for war years and peace years.

    Parameters
    ----------
    final_wars_df : pd.DataFrame
        Wars dataframe, passed through to `country_in_war`.
    country : str
        Name of the country to compute the rates for.
    agricultural_data : pd.DataFrame
        Merged FAO data with 'country', 'year' and 'value_crop' columns (plus
        the population/land/livestock columns that are dropped here).

    Returns
    -------
    tuple(float, float) or None
        (average rate while in war, average rate while in peace); None for
        countries with no war years.
    """
    try:
        # Get the crops data of the country
        country_crops = (
            agricultural_data[agricultural_data['country'] == country]
            .drop(['value_population', 'value_land_area', 'value_livestock'], axis=1)
            .reset_index(drop=True)
        )
        # Get the years in which the country has been at war
        years_in_war = country_in_war(final_wars_df, country_crops, country)
        # Create a first dummy row to create a shifted column of crop data
        # (np.nan instead of the np.NaN alias, which was removed in NumPy 2.0)
        first_row = pd.DataFrame({'in_war': False, 'value_crop': 0, 'year': np.nan}, index=[0])
        # Add a column shifted by one year with crops data
        country_shifted = (
            pd.concat([first_row, years_in_war[:]], sort=False).reset_index(drop=True)
            .drop(['year'], axis=1)[:-1]
            .rename(columns={'value_crop': 'previous_crop'})
        )
        # Align each year with the previous year's production; [1:] drops the dummy row
        rated_crops = pd.concat([country_shifted, country_crops], axis=1)[1:].reset_index(drop=True)
        # Compute the rate of change as a % of the previous year's production
        rated_crops['rate'] = rated_crops.apply(
            lambda row: (row['value_crop'] - row['previous_crop']) / row['previous_crop'] * 100, axis=1)
        # Group according to whether the production was during a war or not and compute the mean
        grouped_rated_crops = rated_crops.groupby('in_war').mean().reset_index()
        # Get the values in war and not in war
        rate_in_war = grouped_rated_crops.loc[grouped_rated_crops['in_war'] == True, 'rate'].to_numpy()
        rate_not_in_war = grouped_rated_crops.loc[grouped_rated_crops['in_war'] == False, 'rate'].to_numpy()
        return rate_in_war[0], rate_not_in_war[0]
    except (IndexError, KeyError, ZeroDivisionError):
        # BUG FIX: the original bare `except` silently swallowed *every* error,
        # hiding real bugs. Only the expected failures are ignored here:
        # IndexError for countries with no war years (empty in-war group),
        # KeyError for missing columns, ZeroDivisionError when a previous
        # year's production is zero.
        return None
# Create copy of final_wars_df in order to keep it as the original
wars_rate_df = final_wars_df.copy()
# Apply the compute rate function to each row in wars_rate_df and split the result into two new columns
# (compute_rate returns a (rate_in_war, rate_not_in_war) tuple, or None for countries without wars)
wars_rate_df['rate-change'] = wars_rate_df.apply(lambda x: compute_rate(wars_rate_df, x['country'], merged_data), axis=1)
# Split rates column into 2 separate columns and drop the rate-change column
wars_rate_df[['rate-war', 'rate-no-war']] = pd.DataFrame(wars_rate_df['rate-change'].tolist(), index=wars_rate_df.index)
wars_rate_df.drop(['rate-change'], axis=1, inplace=True)
# Add the amount of years each country was in war using the years_per_country dataframe created before
wars_rate_df = pd.merge(wars_rate_df, years_per_country, left_on='country', right_on='country')
In the plot, Moldova and Qatar are two big outliers that had big changes during very short wars (which we suppose did not affect production, precisely because the wars were short). We therefore remove these two countries to make the plot more compact and readable.
# Drop the two outlier countries (Moldova and Qatar) in a single filter
wars_rate_df = wars_rate_df[~wars_rate_df['country'].isin(['Moldova', 'Qatar'])]
# Plot the scatter plot
import plotly.graph_objects as go
# One marker per country; marker color encodes the number of years spent at war
fig = go.Figure(data=go.Scatter(
    x=wars_rate_df['rate-war'],
    y=wars_rate_df['rate-no-war'],
    hovertext=wars_rate_df['country'],
    mode='markers',
    marker=dict(colorscale='Reds',
                color = wars_rate_df['years'],
                line=dict(width=0.5,
                          color='Black'),
                showscale=True,
                colorbar=dict(
                    title="Years in War"
                ),
                )
))
# Transparent background; both axes linear with a light grey grid
fig.update_layout(
    title_text = 'Rate of Change of Crop Production',
    showlegend = False,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis = go.layout.XAxis(
        side = 'bottom',
        type = 'linear',
        title = 'Rate of change while in war (%)',
        mirror = True,
        showline = True,
        gridcolor = 'rgb(234, 231, 220)',
        gridwidth = 1,
        fixedrange = True,
        zerolinewidth = 1),
    yaxis = go.layout.YAxis(
        type = 'linear',
        title = 'Rate of change while in peace (%)',
        mirror = True,
        showline = True,
        gridcolor = 'rgb(234, 231, 220)',
        gridwidth = 2,
        fixedrange = True,
        zerolinewidth = 1),
)
# Emphasize the zero lines so the four quadrants are easy to read
fig.update_xaxes(zeroline=True, zerolinewidth=2, zerolinecolor='Black')
fig.update_yaxes(zeroline=True, zerolinewidth=2, zerolinecolor='Black')
fig.show()
# Run this cell to create an HTML object of the above figure
#plotly.offline.plot(fig, filename='crop-rate.html')
We can see from the plot that, apart from a few outliers, almost all the countries are near the origin, and most of them are in the 1st quadrant, meaning that their mean rate is positive both in war and in peace. There is no clear trend in the behavior of the rate of change. However, it is worth noting that only two countries had a negative rate during peace, while many more had a negative trend during wars.
It is also interesting to observe that all the outliers have been at war few years. We have two explanations for this:
Interestingly, the countries that have been at war for a longer time have similar rates between war and non-war years. It is possible that countries that are more frequently in conflicts have developed better strategies to protect their agricultural production.
However, it is evident that it is hard to generalize the analysis at a global level.
We are now going to plot the different regions of Lebanon and their agricultural production in 1998. We assume no major changes to the quality of soil, nor to the environmental and climatic factors would have occured during the war. Therefore, we assume the proportion of production to have been the same during the war as well. The data of agricultural production come from “Atlas du Liban: Territoires et société” by Éric Verdeil, Ghaleb Faour and Sébastien Velut. Institut français du Proche-Orient /CNRS Liban (2007).
Above this plot, we add the events that happened during the Civil War, year by year.
# Imports for the Lebanon case study (HTML parsing, maps and interactive plots)
import json
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import datetime
import plotly.graph_objects as go
import plotly.express as px
import plotly
# Render plotly figures inside the notebook
plotly.offline.init_notebook_mode(connected=True)
def load_geoJson(file_in):
    """
    Loads a GeoJSON file and returns its list of features.

    Parameters
    ----------
    file_in : str
        Path to the GeoJSON file.

    Returns
    -------
    list
        The 'features' array of the GeoJSON FeatureCollection.
    """
    # Do not shadow the path argument with the file handle, and be explicit
    # about the encoding so the load does not depend on the platform default
    with open(file_in, encoding='utf-8') as geo_file:
        districts = json.load(geo_file)
    # Extract only the features from the json file
    district_features = districts['features']
    return district_features
def create_geoDict(geoJson):
    """
    Wraps a list of GeoJSON features into a FeatureCollection whose features
    carry an 'id' field equal to the district name.

    The choropleth map matches dataframe rows to polygons through this 'id',
    so it is set to properties['DISTRICT'].

    Parameters
    ----------
    geoJson : list
        GeoJSON features as returned by `load_geoJson`.

    Returns
    -------
    dict
        A FeatureCollection dict ready to be passed to plotly.
    """
    geoJSON = dict(type='FeatureCollection',
                   features=[])
    # The enumerate index was unused; iterate over the features directly
    for tfeature in geoJson:
        geo_feature = dict(id=tfeature['properties']['DISTRICT'], type="Feature")
        geo_feature['properties'] = tfeature['properties']
        geo_feature['geometry'] = tfeature['geometry']
        geoJSON['features'].append(geo_feature)
    return geoJSON
# Load our geojson file and pass it through the dict creation file
file_in = 'data/2009_districts.geojson'
districts_geo = load_geoJson(file_in)
districts_geo_dict = create_geoDict (districts_geo)
# Create df in which to hard code values. Here, the data we are using is from a physical Atlas on Lebanon (Reference)
# Hence, we must input the values manually into our dataframe.
# One row per district, named after properties['DISTRICT'] in the geojson
districts_list = [d['properties']['DISTRICT'] for d in districts_geo]
districts_df = pd.DataFrame(districts_list)
districts_df.rename(columns={0: "district"}, inplace=True)
districts_df.head()
# Hard-coded values for each district - Agriculture (in area Hectares Ha),
# taken from the physical Atlas on Lebanon
agriculture_by_district = {
    'Bent Jbayl': 12000,
    'Nabatiyeh': 13250,
    'Beirut-three': 0,
    'Zahleh': 24500,
    'Aley': 7750,
    'Baabda': 4000,
    'Chouf': 6000,
    'Jbayl': 6750,
    'Kesrouan': 3000,
    'Matn': 4000,
    'Akkar': 37000,
    'Batroun': 6500,
    'Bcharreh': 2500,
    'Koura': 10500,
    'Miniyeh-Danniyeh': 8750,
    'Tripoli': 1500,
    'Zgharta': 9000,
    'Jezzine': 3750,
    'Saida': 1000,
    'Sour': 17000,
    'Zahrany': 16000,
    'Beirut-two': 0,
    'Beirut-one': 0,
    'Baalbek-Hermel': 77000,
    'WestBekaa-Rachaya': 30000,
    'Marjayoun-Hasbaya': 19500,
}
# Map each district to its value; float dtype matches the original
# one-.loc-assignment-at-a-time column fill
districts_df['agriculture'] = districts_df['district'].map(agriculture_by_district).astype(float)
districts_df.head()
# Load the conflict events data
lebanon_events_file = 'data/lebanon_conflicts.json'
with open(lebanon_events_file) as f:
    lebanon_conflicts_json = json.load(f)
# Get only the data about the events (nested under the leaflet map settings)
lebanon_war_events_json = lebanon_conflicts_json['JSON'][0]['settings']['leaflet'][0]['features']
# Create a DF out of the JSON, dropping the presentation-only columns
lebanon_war_events = pd.DataFrame(lebanon_war_events_json).drop(['cssClass', 'icon', 'type', 'feature_id'], axis=1)
lebanon_war_events.head()
def extract_date(row, start_or_end):
    """
    Extracts the start or end date from a row of the war events df.

    Parameters
    ----------
    row : pd.Series
        Row with a 'popup' column containing the event HTML.
    start_or_end : str
        Either 'start' or 'end'.

    Returns
    -------
    datetime.datetime
        The parsed date; for one-day events the single displayed date is
        returned for both 'start' and 'end'.
    """
    # BUG FIX: the original `assert start_or_end == 'start' or 'end'` was
    # always true, because `or 'end'` is a truthy literal rather than part of
    # the comparison. Use a membership test instead.
    assert start_or_end in ('start', 'end'), 'start_or_end should be either "start" or "end"'
    # Parse the popup of the event
    soup = BeautifulSoup(row['popup'])
    # Get either the start or the end
    class_ = 'date-display-' + start_or_end
    try:
        date = soup.findAll('span', {'class': class_})[0]
    # If there is no start/end, then the event is only one-day, and then return the date
    except IndexError:
        class_ = 'date-display-single'
        date = soup.findAll('span', {'class': class_})[0]
    # Dates are rendered like "April 13, 1975"
    date_time_obj = datetime.datetime.strptime(date.text, '%B %d, %Y')
    return date_time_obj
def extract_event(row):
    """
    Extracts the event name (the text of the first link) from the popup.
    """
    parsed_popup = BeautifulSoup(row['popup'])
    first_link = parsed_popup.findAll('a')[0]
    return first_link.text
def extract_location_name(row):
    """
    Extracts the location name (the third '<br />'-separated field) from the popup.
    """
    popup_fields = row['popup'].split('<br />')
    return popup_fields[2]
# Parse the popup column and drop it
lebanon_war_events['start'] = lebanon_war_events.apply(lambda row: extract_date(row, 'start'), axis=1)
lebanon_war_events['end'] = lebanon_war_events.apply(lambda row: extract_date(row, 'end'), axis=1)
lebanon_war_events['event'] = lebanon_war_events.apply(extract_event, axis=1)
lebanon_war_events['location_name'] = lebanon_war_events.apply(extract_location_name, axis=1)
lebanon_war_events_final = lebanon_war_events.drop('popup', axis=1)
# Sort the events by date
lebanon_war_events_final = lebanon_war_events_final.sort_values(by='start')
# Keep only the start year of the events
lebanon_war_events_final['year'] = lebanon_war_events_final['start'].dt.year
# NOTE(review): this drop is not in-place and its result is discarded — it only
# displays the preview in the notebook; 'start'/'end' remain in the dataframe
lebanon_war_events_final.drop(columns=['start', 'end'])
#px.set_mapbox_access_token(open('.mapbox_token').read())
# NOTE(review): avoid committing access tokens to the repository; prefer the
# file-based variant commented out above
px.set_mapbox_access_token('pk.eyJ1IjoibWlrZS1lcGZsIiwiYSI6ImNrNDVmZHc0ZjA4M2MzbW15azFndW90dzEifQ.zJ5nfu7uke4E-mW4Y9hV9g')
# Custom colorscale: the largest value is much bigger than the others, so with
# plotly's standard color scales everything collapsed onto the lowest colors.
# Each (lower, upper, color) band below gets a constant color; duplicating the
# boundary in consecutive entries produces the stepped scale plotly expects.
_color_bands = [
    (0, 0.05, "rgb(250, 250, 250)"),
    (0.05, 0.07, "rgb(229, 245, 224)"),
    (0.07, 0.10, "rgb(199, 233, 192)"),
    (0.10, 0.12, "rgb(161, 217, 155)"),
    (0.12, 0.17, "rgb(116, 196, 118)"),
    (0.17, 0.22, "rgb(65, 171, 93)"),
    (0.22, 0.30, "rgb(35, 139, 69)"),
    (0.30, 0.45, "rgb(0, 109, 44)"),
    (0.45, 0.65, "rgb(0, 68, 27)"),
    (0.65, 1.0, "rgb(17, 36, 20)"),
]
colorscale = []
for lower, upper, color in _color_bands:
    colorscale.append([lower, color])
    colorscale.append([upper, color])
# Create and add the war events to the figure as points using their coordinates. Use animation frame to see the event locations
# during each year. The coloring in px.scatter_mapbox only works with colorscales. As we want all of our markers to have the
# same color, we apply a value of 1 to each of our points and a colorscale with a suitable upper range color
fig = px.scatter_mapbox(
    lebanon_war_events_final,
    lon='lon',
    lat='lat',
    animation_frame='year',
    size_max=15,
    zoom=7,
    # Constant dummy value so every marker gets the same color from the scale
    color=[1] * lebanon_war_events_final.shape[0],
    color_continuous_scale=px.colors.sequential.Peach,
    range_color = [0,1],
)
# Format the markers and hide the colorscale as it is not needed in our case
fig.update_traces(
    marker=go.scattermapbox.Marker(
        size=10,
        opacity=0.7,
    ),
)
fig.update(layout_coloraxis_showscale=False)
# Add the districts from our geojson file and color them with our custom colorscale
fig.add_trace(
    go.Choroplethmapbox(
        geojson=districts_geo_dict,
        locations = districts_df.district ,
        z=districts_df['agriculture'],
        colorscale=colorscale,
        # zmax matches the largest district value (Baalbek-Hermel, 77000 Ha)
        zmin=0, zmax=77000, marker_line_width=1,
        showscale = True,
        marker_opacity=0.5
    )
)
fig.update_layout(
    margin=dict(l=0, r=5, t=0, b=0),
)
fig.show()
# Run this cell to create HTML object of above plot.
#plotly.offline.plot(fig, filename='plots/map-agri-war-leb4.html')
It is observable that across all the years of the war, the large majority of the events happened nearby Beirut and along the coast, and almost none of them happened in the most fertile areas in the west.
We now plot a map showing the percentage of buildings built during the war in each region of Lebanon. The darker the region, the higher the percentage of buildings.
# Similar to the agricultural data, the new infrastructure data was also obtained from the physical atlas of Lebanon.
# Hard-coded values for each district - Infrastructure (% buildings constructed in 1975-1990)
infrastructure_by_district = {
    'Bent Jbayl': 36,
    'Nabatiyeh': 56,
    'Beirut-three': 0,
    'Zahleh': 54,
    'Aley': 8,
    'Baabda': 8,
    'Chouf': 23,
    'Jbayl': 20,
    'Kesrouan': 26,
    'Matn': 13,
    'Akkar': 50,
    'Batroun': 14,
    'Bcharreh': 14,
    'Koura': 21,
    'Miniyeh-Danniyeh': 26,
    'Tripoli': 14,
    'Zgharta': 21,
    'Jezzine': 22,
    'Saida': 79,
    'Sour': 64,
    'Zahrany': 55,
    'Beirut-two': 0,
    'Beirut-one': 0,
    'Baalbek-Hermel': 52,
    'WestBekaa-Rachaya': 37,
    'Marjayoun-Hasbaya': 48,
}
# Map each district to its value; float dtype matches the original
# one-.loc-assignment-at-a-time column fill
districts_df['infrastructure'] = districts_df['district'].map(infrastructure_by_district).astype(float)
districts_df.head()
# NOTE(review): avoid committing access tokens to the repository
mapbox_access_token = 'pk.eyJ1IjoibWlrZS1lcGZsIiwiYSI6ImNrNDVmZHc0ZjA4M2MzbW15azFndW90dzEifQ.zJ5nfu7uke4E-mW4Y9hV9g'
# Create figure to apply our geojson and infrastructure values
fig = go.Figure(go.Choroplethmapbox(geojson=districts_geo_dict, locations = districts_df.district ,
                                    z=districts_df['infrastructure'],
                                    colorscale="blues",
                                    text='%',
                                    hoverinfo=['z'] + ['text'],
                                    zmin=0, zmax=75, marker_line_width=1, showscale = True,
                                    marker_opacity=0.5))
# Center the map on Lebanon
fig.update_layout(
    mapbox=go.layout.Mapbox(
        accesstoken=mapbox_access_token),
    mapbox_zoom=7, mapbox_center = {"lon": 36, "lat": 34},
    margin={"r":0,"t":5,"l":0,"b":0},
)
fig.show()
It is clear that very few buildings have been built along the coast (with an exception for the south), where many war events have taken place, while many have been built in inner Lebanon, where fewer events happened.
# Run this cell to create HTML opbject of above plot.
# plotly.offline.plot(fig, filename='map-infra-leb2.html')
Now, let us plot how the population of Beirut evolved during the war:
beirut_population = pd.read_csv('data/beirut_population.csv')
beirut_population
# Years of the Lebanese Civil War.
# NOTE(review): range(1975, 1990) excludes 1990 itself; the 1990 data point is
# added manually as a war year below — confirm this is intentional
war_years = range(1975, 1990)
# Remove projections from 2020 on
beirut_population = beirut_population.loc[4:]
# Flag the years in which Lebanon has been at war
beirut_population['in_war'] = beirut_population.apply(lambda row: row['Year'] in war_years, axis=1)
pop_in_war = beirut_population.copy()
pop_no_war = beirut_population.copy()
# Add manually entries to fill the gap that there would be between 1970 and 1975
link_before = pd.DataFrame({
    'Year': [1970, 1975],
    'Population': [922842, 1500000],
    'in_war': [False, False]
})
# Add manually entries to fill the gap that there would be between 1985 and 1990
link_after = pd.DataFrame({
    'Year': [1985, 1990],
    'Population': [1585107, 1292529],
    'in_war': [True, True]
})
pop_no_war = pop_no_war.append(link_before).sort_values(by='Year')
pop_in_war = pop_in_war.append(link_after).sort_values(by='Year')
# BUG FIX: blank out the other segment's years with .loc instead of chained
# indexing (df['col'][mask] = ...), which triggers SettingWithCopyWarning and
# may silently fail to write through to the dataframe
pop_in_war.loc[~pop_in_war['in_war'], 'Population'] = None
pop_no_war.loc[pop_no_war['in_war'], 'Population'] = None
fig = go.Figure()
# Grey trace: population outside the war years (gaps stay unconnected)
fig.add_trace(
    go.Scatter(
        x=pop_no_war["Year"], y=pop_no_war["Population"],
        mode='lines+markers',
        name='Not in war',
        hoverinfo='skip',
        connectgaps=False,
        line=dict(color='RGB(142, 141, 138)', width=2
                  )
    )
)
# Red trace: population during the war years
fig.add_trace(
    go.Scatter(
        x=pop_in_war["Year"],
        y=pop_in_war["Population"],
        mode='lines+markers',
        name='In war',
        connectgaps=False,
        hoverinfo='skip',
        line=dict(color='RGB(232, 90, 79)', width=2
                  )
    )
)
fig.update_layout(
    title='Beirut Urban Area Population',
    xaxis_title='Year',
    yaxis_title='Population',
    plot_bgcolor = "White",
    margin=dict(l=0, r=5, t=45, b=0),
)
# Light grey grid on both axes
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='RGB(234, 231, 220)')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='RGB(234, 231, 220)')
fig.show()
#plotly.offline.plot(fig, filename='plots/beirut-population.html')
We can observe that there is a general positive trend interrupted by the war.
All these plots bring us to believe that agricultural production increased because of internal migration: as war broke out in Beirut and in coastal regions, people escaped to inner regions and started working in the fields, leading to a general increase in crops production.
This is confirmed by Atif Abdallah Kubursi, who in his work “Lebanon’s Agricultural Potential: A Policy Analysis Matrix Approach” (McMaster University and Econometric Research limited), states that the “agricultural sector acted as a buffer sector which absorbed large numbers of people from the urban areas that sought refuge in the rural areas”
In this part of the notebook we get in more detail and investigate how the production of specific crops is influenced by the Civil War in Lebanon. To do that we use FAO dataset with non-aggregated items.
# Read the files (FAO exports are latin-1 encoded)
crops_prod_lebanon_specific = pd.read_csv(fao_path + 'crops_production.csv', encoding='latin-1')
# Getting a quick view at the data
crops_prod_lebanon_specific.head(3)
Let us take care of the data a little bit.
# Keeping only relevant columns
crops_prod_lebanon_specific = drop_useless_columns(crops_prod_lebanon_specific)
crops_prod_lebanon_specific.drop(columns='country', inplace=True)
# Extract the year number from the DateTime 'year' column
crops_prod_lebanon_specific['year'] = crops_prod_lebanon_specific['year'].dt.year
# Keeping only the war years (1975-1990 inclusive, i.e. 16 years)
crops_war = crops_prod_lebanon_specific[
    (crops_prod_lebanon_specific.year >= 1975) & (crops_prod_lebanon_specific.year <= 1990)
].copy()
# Keeping the rows with NaN values
null_data = crops_war[crops_war.isnull().any(axis=1)]
# Output new dataframe: how many war years each NaN-affected item is missing
items_with_nan = (null_data.groupby('item')
                  .count()
                  .drop(columns='value')
                  .rename(columns={'year': 'counts'})
                  )
# Dropping rows with NaN values (not relevant items)
crops_war.dropna(inplace=True)
# Dropping the items that have missing value in some war years.
# BUG FIX: the original indexed `crops_war.item.unique()` (appearance order)
# with a boolean mask computed from `groupby('item')` (sorted-key order); the
# two orders need not match, so the wrong items could be dropped. Take the
# item labels directly from the groupby counts instead.
year_counts = crops_war.groupby('item').count().year
items_with_year_missed = year_counts[year_counts != 16].index
crops_war = crops_war[~crops_war['item'].isin(items_with_year_missed)]
# Getting the wide version of the dataset (items x years)
crops_war_pivoted = crops_war.pivot(index='item', columns='year', values='value')
# Keeping only top-10 items in boundary years
top_crops = crops_war_pivoted[1975].sort_values(ascending=False).head(10).index.to_list() + \
    crops_war_pivoted[1990].sort_values(ascending=False).head(10).index.to_list()
crops_animation = crops_war_pivoted.loc[top_crops].drop_duplicates()
We dropped the items that have missing values as we focus only on top-10 products in the boundary year (1975 and 1990). The final dataset is stored in the crops_animation.
Now we can prepare our dataset for the Flourish visualisation that we use to get an animation of the different crops production over the war year in Lebanon. To do that we do MinMax scaling to compare different crops production. Plus we keep only values from every five years due to high variation in the data (probably because of seasonal harvest of some crops).
# Apply min-max scaling for the whole dataset so different crops are comparable.
# BUG FIX: the original computed the min subtraction and then immediately
# overwrote it with a division of the *unshifted* data, so the shift was
# discarded; proper min-max scaling is (x - min) / (max - min).
_crops_min = crops_animation.min().min()
_crops_max = crops_animation.max().max()
crops_animation_scaled = (crops_animation - _crops_min) / (_crops_max - _crops_min)
# Keep only quinquennial years
crops_animation_5 = crops_animation_scaled.iloc[:, ::5]
# Adding the fancy image for each item in the animation (one URL per row)
images_list = ['https://cdn.pixabay.com/photo/2017/02/26/12/27/oranges-2100108_1280.jpg',
               'https://cdn.pixabay.com/photo/2011/03/24/20/16/apple-5880_960_720.jpg',
               'https://cdn.pixabay.com/photo/2018/08/22/13/58/grapes-3623694_960_720.jpg',
               'https://cdn.pixabay.com/photo/2017/02/05/12/31/lemons-2039830_1280.jpg',
               'https://cdn.pixabay.com/photo/2018/07/08/21/35/wheat-3524861_1280.jpg',
               'https://cdn.pixabay.com/photo/2016/03/26/16/44/tomatoes-1280859_1280.jpg',
               'https://cdn.pixabay.com/photo/2019/04/18/21/46/sugar-beet-4138196_1280.jpg',
               'https://cdn.pixabay.com/photo/2016/09/03/20/48/bananas-1642706_1280.jpg',
               'https://cdn.pixabay.com/photo/2019/12/04/15/44/cucumber-4672972_1280.jpg',
               'https://cdn.pixabay.com/photo/2016/08/11/08/43/potatoes-1585060_1280.jpg',
               'https://cdn.pixabay.com/photo/2014/05/30/04/04/olives-357849_1280.jpg',
               'https://cdn.pixabay.com/photo/2016/03/05/19/14/bulb-1238332_1280.jpg']
crops_animation_5.loc[:, 'image'] = images_list
# Saving the dataset to the data folder
crops_animation_5.to_csv('data/crops_animation_5.csv')
The crops_animation_5 dataframe is used for the Line chart race visualisation.
To check if the increase in crop production was due to the internal consumption but not to the export we use additional dataset from FAO that contains the trading data of crop and livestock products.
We will start by reading and preprocessing the FAOSTAT dataset. The necessary data is available as a zip archive, which consists of a folder with two files: Trade_Crops_Livestock_E_All_Data_(Normalized).csv and Trade_Crops_Livestock_E_Flags.csv. The Normalized version of the data will be used for the subsequent analysis and should be renamed to trading_data.csv.
The obtained trading_data.csv file is quite big ~ 1.6 GB, so we will not upload it to the remote GitHub version of the project.
# The path to FAO data
fao_path = 'data/fao/'
# Reading the dataset from csv file and leaving only needed columns
trading_data_normalized = pd.read_csv(fao_path + 'trading_data.csv', encoding='latin-1')
trading_data = trading_data_normalized[['Area', 'Item', 'Element', 'Year', 'Unit', 'Value']]
# Renaming columns to the snake_case style used throughout the notebook
trading_data = trading_data.rename(columns={
    'Area': 'country',
    'Item': 'item',
    'Element': 'value_meaning',
    'Year': 'year',
    'Value': 'value',
    'Unit': 'unit'
})
# Convert the year to a datetime object
trading_data['year'] = pd.to_datetime(trading_data['year'], format='%Y')
# Leaving only the Import Quantity and the Export Quantity rows
trading_data = trading_data[(trading_data['value_meaning'] == 'Import Quantity') | (trading_data['value_meaning'] == 'Export Quantity')]
Let us get the list of countries in the trading_data dataframe.
trading_data['country'].unique()
We can see that at the end of this list we have rows indicating not individual countries but aggregated data (e.g. 'World', 'Africa', 'European Union trade', etc.). Let us drop these rows.
# Labels that correspond to aggregated regions rather than individual countries
aggregated_region_labels = [
    'World', 'Africa', 'Eastern Africa', 'Middle Africa', 'Northern Africa',
    'Southern Africa', 'Western Africa', 'Americas', 'Northern America',
    'Central America', 'Caribbean', 'South America', 'Asia', 'Central Asia',
    'Eastern Asia', 'Southern Asia', 'South-Eastern Asia', 'Western Asia',
    'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe',
    'Western Europe', 'Oceania', 'Australia & New Zealand', 'Melanesia',
    'Micronesia', 'Polynesia', 'EU(12)ex.int', 'EU(15)ex.int', 'EU(25)ex.int',
    'EU(27)ex.int', 'European Union', 'European Union (exc intra-trade)',
    'Least Developed Countries', 'Land Locked Developing Countries',
    'Small Island Developing States', 'Low Income Food Deficit Countries',
    'Net Food Importing Developing Countries',
]
# Drop the aggregated rows by label, keeping only real countries
trading_data = (trading_data
                .set_index('country')
                .drop(index=aggregated_region_labels)
                .reset_index())
Let us calculate the number of rows with NaN values.
print('The number of rows with NaN values is: %d out of %d' % (trading_data[pd.isnull(trading_data).any(axis=1)].shape[0], trading_data.shape[0]))
We can see that we have a huge number of rows with NaN values. In scope of our project we treat these values as zero, because in most cases we will focus only on the specific major trading products when comparing different countries that do not contain NaN values.
# Treat missing trade quantities as zero
trading_data = trading_data.fillna(0)
Now we can separate the import data and the export data for future analysis.
# Split the trading data into separate dataframes for imports and exports
import_data = trading_data.loc[trading_data['value_meaning'] == 'Import Quantity']
export_data = trading_data.loc[trading_data['value_meaning'] == 'Export Quantity']
Let us see how many distinct trading products do we have in each dataset.
# Count the distinct products in each of the two datasets
n_import_items = len(import_data['item'].unique())
n_export_items = len(export_data['item'].unique())
print('The number of distinct trading products in import dataset is: %d' % n_import_items)
print('The number of distinct trading products in export dataset is: %d' % n_export_items)
We can see that in our dataset we have a huge diversity of the trading products. It will be convenient to sort them by the average import and export value over the time period - in that case we will have the most important products with the biggest trading (import or export) value above. We will create two additional dataframes import_data_sorted and export_data_sorted which keeps the biggest trading (based on the average import/export value) products above.
# Create the dataframe which contains the biggest import products for each country
import_data_sorted = (import_data.groupby(['country', 'item'])
                      # calculate the mean for each country and for each item over the time period
                      .mean()
                      # then group by country
                      .groupby(level='country', group_keys=False)
                      # and sort the items by the average values over the years
                      .apply(lambda x: x.sort_values(by='value', ascending=False))
                      # create a column which contains the average import value
                      # for each item for specific country
                      .rename(columns={'value': 'average_import_value'})
                      )
# Create the dataframe which contains the biggest export products for each country
export_data_sorted = (export_data.groupby(['country', 'item'])
                      # calculate the mean for each country and for each item over the time period
                      .mean()
                      # then group by country
                      .groupby(level='country', group_keys=False)
                      # and sort the items by the average values over the years
                      .apply(lambda x: x.sort_values(by='value', ascending=False))
                      # create a column which contains the average export value
                      # for each item for specific country
                      .rename(columns={'value': 'average_export_value'})
                      )
# Show the country and the biggest import products in a nice way (top 3 per country)
import_data_sorted.groupby(level='country', group_keys=False).apply(lambda df: df.nlargest(n=3, columns='average_import_value')).head(20)
# Show the country and the biggest export products in a nice way (top 3 per country)
export_data_sorted.groupby(level='country', group_keys=False).apply(lambda df: df.nlargest(n=3, columns='average_export_value')).head(20)
For the following analysis we will focus on Lebanon. This country was involved in a civil war (the Lebanese Civil War) from 1975 to 1990, in which we are particularly interested in this project.
Let us show the top 3 biggest import and export products in Lebanon.
# Top 3 products by average trading value for Lebanon, for both directions
lebanon_top_imports = import_data_sorted.loc['Lebanon'].nlargest(n=3, columns='average_import_value').index.tolist()
lebanon_top_exports = export_data_sorted.loc['Lebanon'].nlargest(n=3, columns='average_export_value').index.tolist()
print('The top 3 import products in Lebanon are:', lebanon_top_imports,
      '\nThe top 3 export products in Lebanon are:', lebanon_top_exports)
# Calculating the summed import and export values for each country
# (sums over all items, per country and year)
import_summed_data = import_data.groupby(['country', 'year']).sum().rename(columns={'value': 'total_import_value'})
export_summed_data = export_data.groupby(['country', 'year']).sum().rename(columns={'value': 'total_export_value'})
# Focus only on Lebanon
export_lebanon = export_summed_data.reset_index()
export_lebanon = export_lebanon[export_lebanon['country'] == 'Lebanon']
# Back from datetime to plain year numbers for the x-axis
export_lebanon['year']=export_lebanon['year'].dt.year
# Plot a bar chart
import plotly.express as px
# Create a list for the bar colors
years = export_lebanon['year'].to_list()
# PEP 8 (E731): use a def instead of assigning a lambda to a name
def define_war(year):
    """Return the bar color: red for war years (1975-1990), grey otherwise."""
    return 'rgb(232, 90, 79)' if 1975 <= year <= 1990 else 'rgb(142, 141, 138)'
colors = list(map(define_war, years))
# Bar chart of total yearly crop export; war years are highlighted in red
fig = go.Figure(data=[go.Bar(
    x=export_lebanon['year'].to_list(),
    y=export_lebanon['total_export_value'].to_list(),
    marker_color=colors,
    hoverinfo='skip'
)
])
fig.update_layout(
    title='Summed crop export',
    xaxis_title='Year',
    yaxis_title='Export value (tonnes)',
    plot_bgcolor = "White",
    margin=dict(l=0, r=5, t=45, b=0),
)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='RGB(234, 231, 220)')
fig.show()
#plotly.offline.plot(fig, filename='plots/export_lebanon_value.html')
According to the plot the crop export almost stayed the same while there was a huge increase in total production (see above). Hence, we may conclude that all the crops were consumed inside the country.